{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Edge Probing Data Viewer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys, os, re, json, io\n",
    "import itertools\n",
    "import collections\n",
    "from importlib import reload\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "from sklearn import metrics\n",
    "from tqdm import tqdm\n",
    "\n",
    "from src.utils import utils"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "from IPython.display import display\n",
    "import ipywidgets as widgets\n",
    "\n",
    "class EdgeProbingExample(object):\n",
    "    \"\"\"Human-readable view of a single edge probing record.\n",
    "\n",
    "    The wrapped record is indexed with 'text' and 'targets'; every target is\n",
    "    indexed with 'span1' and 'label', and with 'span2' when that key is present.\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self, record):\n",
    "        \"\"\"Keep a reference to `record`; it is only read when rendering.\"\"\"\n",
    "        self._data = record\n",
    "\n",
    "    def __repr__(self):\n",
    "        \"\"\"Reuse the `str()` rendering so a bare cell expression is readable.\"\"\"\n",
    "        return str(self)\n",
    "\n",
    "    def __str__(self):\n",
    "        \"\"\"Return the text line followed by one indented stanza per target.\"\"\"\n",
    "        sentence = self._data['text']\n",
    "        # Span offsets index into the whitespace-separated tokens of the text.\n",
    "        words = sentence.split()\n",
    "\n",
    "        def describe_span(start, end):\n",
    "            # Show the half-open range [start, end) next to the tokens it covers.\n",
    "            covered = \" \".join(words[start:end])\n",
    "            return '[{:d},{:d})\\t\"{:s}\"'.format(start, end, covered)\n",
    "\n",
    "        pieces = [\"Text ({:d}): {:s}\\n\".format(len(words), sentence)]\n",
    "        for target in self._data['targets']:\n",
    "            pieces.append(\"\\n\")\n",
    "            pieces.append(\"  span1: {}\\n\".format(describe_span(*target['span1'])))\n",
    "            if 'span2' in target:\n",
    "                pieces.append(\"  span2: {}\\n\".format(describe_span(*target['span2'])))\n",
    "            # NOTE(review): wrap_singleton_string presumably turns a lone string\n",
    "            # label into a one-element list so len()/join work -- confirm in utils.\n",
    "            names = utils.wrap_singleton_string(target['label'])\n",
    "            pieces.append(\"  label: ({:d})\\t {}\\n\".format(len(names), \", \".join(names)))\n",
    "        return \"\".join(pieces)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['dep_ewt',\n",
       " 'dpr',\n",
       " 'ner_conll2003',\n",
       " 'ontonotes-coref',\n",
       " 'ontonotes-coref-conll',\n",
       " 'ontonotes-ner',\n",
       " 'spr2',\n",
       " 'srl_conll2005',\n",
       " 'srl_conll2012',\n",
       " 'spr1',\n",
       " 'stats.retokenized.OpenAI.BPE.tsv',\n",
       " 'ih2',\n",
       " 'ontonotes-constituents-old',\n",
       " 'ontonotes-constituents']"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Location of the edge probing data; the listing below shows what is available there.\n",
    "edges_data_path = \"/nfs/jsalt/share/glue_data/edges\"\n",
    "# os.listdir() returns entries in arbitrary order, so sort them to keep the\n",
    "# displayed listing stable from one run to the next.\n",
    "sorted(os.listdir(edges_data_path))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n",
      "0it [00:00, ?it/s]\u001b[A\n",
      "10732it [00:00, 63457.25it/s]\u001b[A\n",
      "15680it [00:00, 71487.78it/s]\u001b[A"
     ]
    }
   ],
   "source": [
    "# File to inspect. To view a different one, swap the active assignment with\n",
    "# one of the commented-out alternatives listed underneath it.\n",
    "record_file = \"/nfs/jsalt/share/glue_data/edges/ontonotes-ner/ner_ontonotes_en_dev.json.retokenized.bert-base-cased\"\n",
    "# record_file = \"/nfs/jsalt/share/glue_data/edges/spr1/spr1.dev.json.retokenized.bert-base-uncased\"\n",
    "# record_file = \"/nfs/jsalt/home/nkim/edges/srl_conll2005/test.wsj.edges.real.json.notretokenized\"\n",
    "# record_file = \"/nfs/jsalt/share/glue_data/edges/srl_conll2005/test.wsj.edges.json\"\n",
    "# record_file = \"/nfs/jsalt/home/iftenney/spr2_test/dev.edges.json\"\n",
    "# record_file = \"/nfs/jsalt/home/iftenney/spr2_test/dev.edges.json.retokenized.OpenAI.BPE\"\n",
    "\n",
    "# Read every record (tqdm reports progress), then keep only the last 1000.\n",
    "records = list(tqdm(utils.load_json_data(record_file)))[-1000:]\n",
    "\n",
    "# Raw text of each retained record, in the same order as `records`.\n",
    "all_texts = [record['text'] for record in records]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "08437059cac1479dbf89c259dd42817f",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "interactive(children=(Dropdown(description='i', options={'In a stark contrast , the bench ##mark 30 - year Tre…"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "<function __main__.print_info(i)>"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def print_info(i):\n",
    "    \"\"\"Print the formatted view of `records[i]`.\n",
    "\n",
    "    Args:\n",
    "        i: integer position of the example in the notebook-level `records` list.\n",
    "    \"\"\"\n",
    "    print(EdgeProbingExample(records[i]))\n",
    "\n",
    "# Offer (label, index) pairs instead of a {text: index} dict: records with\n",
    "# identical text would collapse into a single dict key, leaving only the last\n",
    "# of them selectable. Prefixing the index keeps equal texts distinguishable.\n",
    "example_choices = [(\"{:d}: {:s}\".format(i, t), i) for i, t in enumerate(all_texts[:1000])]\n",
    "# The trailing semicolon suppresses the `<function print_info>` return value.\n",
    "widgets.interact(print_info, i=example_choices);"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
